install.packages("FactoMineR")
# Para generar gráficos alternativos
install.packages("factoextra")library("FactoMineR")
library("factoextra")# Valores de los gráficos por defecto
mi.tema <- theme_grey() + theme(panel.border = element_rect(fill = NA,color = "black"), plot.title = element_text(hjust = 0.5))setwd("~/Google Drive/MDCurso/Datos")
Datos <- read.table('EjemploEstudiantes.csv', header=TRUE, sep=';',dec=',',row.names=1)
modelo <- prcomp(Datos,scale. = TRUE,center = TRUE)
modelo## Standard deviations (1, .., p=5):
## [1] 1.70095552 1.27618589 0.58872409 0.35016062 0.09429419
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4 PC5
## Matematicas -0.5266440 -0.27049630 0.43820071 -0.26121779 -0.6238776
## Ciencias -0.4249362 -0.50807221 0.04049491 0.67362724 0.3253895
## Espanol -0.3591470 0.56208159 0.56227583 -0.07008647 0.4837473
## Historia -0.3526975 0.58648985 -0.39418032 0.44664495 -0.4204335
## EdFisica 0.5373018 0.09374599 0.57862603 0.52305619 -0.3067941
biplot(modelo)fviz_pca_biplot(modelo,col.var = "#2E9FDF",col.ind = "#696969",ggtheme = mi.tema)setwd("~/Google Drive/MDCurso/Datos")
Datos <- read.table('EjemploClientesCorregidaEdad.csv', header=TRUE, sep=';',dec=',',row.names=1)
modelo <- prcomp(Datos)biplot(modelo)fviz_pca_biplot(modelo,col.var = "#2E9FDF",col.ind = "#696969",ggtheme = mi.tema)setwd("~/Google Drive/MDCurso/Datos")
Datos <- read.table('EjemploEstudiantes.csv', header=TRUE, sep=';',dec=',',row.names=1)
res<-PCA(Datos, scale.unit=TRUE, ncp=5, graph = FALSE)plot(res, axes=c(1, 2), choix="ind", col.ind="red",new.plot=TRUE)plot(res, axes=c(1, 2), choix="var", col.var="blue",new.plot=TRUE)fviz_pca_ind(res, pointsize = 5, pointshape = 21, fill = "#E7B800", repel = TRUE,ggtheme = mi.tema)fviz_pca_var(res,col.var="steelblue",ggtheme = mi.tema)res## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 10 individuals, described by 5 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
res$eig## eigenvalue percentage of variance
## comp 1 2.893249673 57.8649935
## comp 2 1.628650425 32.5730085
## comp 3 0.346596049 6.9319210
## comp 4 0.122612460 2.4522492
## comp 5 0.008891393 0.1778279
## cumulative percentage of variance
## comp 1 57.86499
## comp 2 90.43800
## comp 3 97.36992
## comp 4 99.82217
## comp 5 100.00000
res$var## $coord
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Matematicas 0.8957980 -0.3452036 0.25797931 -0.09146818 0.05882803
## Ciencias 0.7227976 -0.6483946 0.02384033 0.23587773 -0.03068234
## Espanol 0.6108931 0.7173206 0.33102532 -0.02454152 -0.04561456
## Historia 0.5999227 0.7484701 -0.23206345 0.15639747 0.03964443
## EdFisica -0.9139265 0.1196373 0.34065108 0.18315368 0.02892890
##
## $cor
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Matematicas 0.8957980 -0.3452036 0.25797931 -0.09146818 0.05882803
## Ciencias 0.7227976 -0.6483946 0.02384033 0.23587773 -0.03068234
## Espanol 0.6108931 0.7173206 0.33102532 -0.02454152 -0.04561456
## Historia 0.5999227 0.7484701 -0.23206345 0.15639747 0.03964443
## EdFisica -0.9139265 0.1196373 0.34065108 0.18315368 0.02892890
##
## $cos2
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Matematicas 0.8024540 0.11916550 0.0665533242 0.0083664287 0.0034607374
## Ciencias 0.5224364 0.42041555 0.0005683612 0.0556383052 0.0009414059
## Espanol 0.3731904 0.51454884 0.1095777630 0.0006022863 0.0020806881
## Historia 0.3599073 0.56020745 0.0538534429 0.0244601695 0.0015716811
## EdFisica 0.8352616 0.01431309 0.1160431572 0.0335452699 0.0008368811
##
## $contrib
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Matematicas 27.73539 7.3168250 19.2019858 6.8234735 38.92233
## Ciencias 18.05708 25.8137375 0.1639838 45.3773665 10.58783
## Espanol 12.89866 31.5935718 31.6154103 0.4912113 23.40115
## Historia 12.43955 34.3970346 15.5378121 19.9491712 17.67643
## EdFisica 28.86932 0.8788311 33.4808079 27.3587774 9.41226
res$ind## $coord
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Lucia 0.32306263 1.7725245 1.19880074 -0.05501532 0.003633384
## Pedro 0.66544057 -1.6387021 0.14547628 -0.02306468 -0.123377296
## Ines 1.00254705 -0.5156925 0.62888764 0.51644351 0.142875824
## Luis -3.17209481 -0.2627820 -0.38196027 0.67777691 -0.062503554
## Andres -0.48886797 1.3654021 -0.83523570 -0.15579197 0.123367255
## Ana 1.70863322 -1.0217004 -0.12707707 0.06683295 0.025291503
## Carlos 0.06758577 1.4623364 -0.50624044 -0.11792847 0.013123980
## Jose 2.01185516 -1.2758646 -0.54215002 -0.19778670 0.017434170
## Sonia -3.04203029 -1.2548807 0.44882861 -0.63999876 0.037884840
## Maria 0.92386867 1.3693593 -0.02932977 -0.07146746 -0.177730107
##
## $cos2
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Lucia 0.022270827 0.670420670 0.306659839 0.0006458478 2.816992e-06
## Pedro 0.139905502 0.848430539 0.006686527 0.0001680781 4.809354e-03
## Ines 0.514468899 0.136122895 0.202439714 0.1365196756 1.044882e-02
## Luis 0.936851990 0.006429392 0.013583605 0.0427712757 3.637375e-04
## Andres 0.084139511 0.656353715 0.245603703 0.0085448999 5.358172e-03
## Ana 0.732686110 0.261979570 0.004052795 0.0011209894 1.605349e-04
## Carlos 0.001892733 0.886081139 0.106192189 0.0057625700 7.136907e-05
## Jose 0.673612108 0.270910359 0.048916504 0.0065104446 5.058468e-05
## Sonia 0.808829929 0.137636943 0.017607237 0.0358004434 1.254472e-04
## Maria 0.308554271 0.677869212 0.000310977 0.0018464085 1.141913e-02
##
## $contrib
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Lucia 0.36073437 19.2910834 41.46392357 0.24684974 0.01484748
## Pedro 1.53049754 16.4881591 0.61060555 0.04338706 17.11987788
## Ines 3.47395038 1.6328779 11.41096846 21.75259335 22.95871968
## Luis 34.77814436 0.4239976 4.20932799 37.46613853 4.39379307
## Andres 0.82603273 11.4470414 20.12771563 1.97950024 17.11709152
## Ana 10.09047896 6.4094282 0.46591936 0.36428947 0.71941493
## Carlos 0.01578791 13.1300601 7.39418080 1.13423412 0.19371414
## Jose 13.98967133 9.9949649 8.48038057 3.19050613 0.34184774
## Sonia 31.98461714 9.6688984 5.81215853 33.40593699 1.61421395
## Maria 2.95008527 11.5134890 0.02481953 0.41656436 35.52647960
##
## $dist
## Lucia Pedro Ines Luis Andres Ana Carlos Jose
## 2.164804 1.779065 1.397736 3.277258 1.685356 1.996135 1.553497 2.451273
## Sonia Maria
## 3.382478 1.663200
setwd("~/Google Drive/MDCurso/Datos")
# Ejemplo de las importaciones de México
Datos <- read.table('ImportacionesMexico.csv', header=TRUE, sep=';',dec=',',row.names=1)
res<-PCA(Datos, scale.unit=TRUE, ncp=5, graph = FALSE)
cos2.ind<-(res$ind$cos2[,1]+res$ind$cos2[,2])*100
cos2.ind## 1979 1980 1981 1982 1983 1984
## 96.1000354 86.9743777 80.6523653 70.7456314 65.6042706 85.9034700
## 1985 1986 1987 1988
## 75.1710823 81.9688762 4.9898384 0.8100985
# Grafica los individuos que tengan cos2 >= 0.1 (10%)
plot(res, axes=c(1, 2), choix="ind",col.ind="red",new.plot=TRUE,select="cos2 0.1")# Grafica los individuos que tengan cos2 >= 0.1 (10%)
fviz_pca_ind(res, pointsize = 5, pointshape = 21, fill = "#E7B800", repel = TRUE, select.ind = list(cos2 = 0.1),ggtheme = mi.tema)cos2.var<-(res$var$cos2[,1]+res$var$cos2[,2])*100
cos2.var## Costa.Rica El.Salvador Guatemala Honduras Nicaragua Panama
## 73.87060 89.22701 71.89417 67.23724 77.57747 90.95865
# Grafica las variables que tengan cos2 >= 0.1 (10%)
plot(res, axes=c(1, 2), choix="var",col.var="blue",new.plot=TRUE,select="cos2 0.1")# Grafica las variables que tengan cos2 >= 0.7 (70%)
plot(res, axes=c(1, 2), choix="var",col.var="blue",new.plot=TRUE,select="cos2 0.7")# Grafica las variables que tengan cos2 >= 0.1 (10%)
fviz_pca_var(res,col.var="steelblue", select.var = list(cos2 = 0.1),ggtheme = mi.tema)# Grafica las variables que tengan cos2 >= 0.7 (70%)
fviz_pca_var(res,col.var="steelblue", select.var = list(cos2 = 0.7),ggtheme = mi.tema)setwd("~/Google Drive/MDCurso/Datos")
Datos<-read.table("EjemploEstudiantes.csv",header=TRUE,sep=";",dec=",",row.names=1)
Datos Matematicas Ciencias Espanol Historia EdFisica
Lucia 7.0 6.5 9.2 8.6 8.0
Pedro 7.5 9.4 7.3 7.0 7.0
Ines 7.6 9.2 8.0 8.0 7.5
Luis 5.0 6.5 6.5 7.0 9.0
Andres 6.0 6.0 7.8 8.9 7.3
Ana 7.8 9.6 7.7 8.0 6.5
Carlos 6.3 6.4 8.2 9.0 7.2
Jose 7.9 9.7 7.5 8.0 6.0
Sonia 6.0 6.0 6.5 5.5 8.7
Maria 6.8 7.2 8.7 9.0 7.0
matriz.distacias <- dist(Datos)
matriz.distacias Lucia Pedro Ines Luis Andres Ana
Pedro 3.9786933
Ines 3.1144823 1.3379088
Luis 3.8535698 4.3931765 4.4215382
Andres 1.9467922 4.2142615 3.7000000 3.0724583
Ana 3.8871583 1.2409674 1.1357817 5.1127292 4.2023803
Carlos 1.5165751 3.9102430 3.2649655 3.4394767 0.6557439 3.7722672
Jose 4.2778499 1.5132746 1.6852300 5.4451814 4.4598206 0.5567764
Sonia 4.3174066 4.4260592 4.7686476 1.8947295 3.9000000 5.3600373
Maria 1.3928388 3.3600595 2.5258662 4.0706265 1.7291616 3.0016662
Carlos Jose Sonia
Pedro
Ines
Luis
Andres
Ana
Carlos
Jose 4.0472213
Sonia 4.2000000 5.6426944
Maria 1.0862780 3.3015148 4.6968074
# cmdscale ejecuta el ACP (MDS) pero desde la matriz de distacias
res <- cmdscale(matriz.distacias,eig=TRUE, k=2) # k es el número de componentes a usas
res # Ver las componentes y más$points
[,1] [,2]
Lucia 0.76471745 1.5817637
Pedro -1.66887794 -1.3919656
Ines -1.57822841 -0.2994960
Luis 2.60701317 -1.3202040
Andres 1.43877557 1.3356687
Ana -2.34790534 -0.3880845
Carlos 0.89372557 1.5189012
Jose -2.64984571 -0.4254636
Sonia 2.62959083 -2.1833951
Maria -0.08896518 1.5722752
$eig
[1] 3.498309e+01 1.793415e+01 2.708155e+00 1.511504e+00 7.710194e-02
[6] 3.775779e-15 2.673082e-15 2.327544e-15 -9.116253e-16 -3.093847e-15
$x
NULL
$ac
[1] 0
$GOF
[1] 0.9249002 0.9249002
# Plotear la solución
x <- res$points[,1]
y <- res$points[,2]
plot(x, y, xlab="Componente 1", ylab="Componente 2",main="MDS",pch = 19)
text(x, y, labels = row.names(Datos), cex=0.85 , pos = 1) res2 <- PCA(Datos, scale.unit=FALSE, ncp=5, graph = FALSE)
res2$ind$coord## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Lucia -0.76471745 1.5817637 1.11186219 0.039908252 0.0007926494
## Pedro 1.66887794 -1.3919656 0.09067929 -0.053941739 -0.1128975404
## Ines 1.57822841 -0.2994960 0.48752985 0.552652786 0.1377925976
## Luis -2.60701317 -1.3202040 -0.46230941 0.784004734 -0.0524431575
## Andres -1.43877557 1.3356687 -0.67985389 -0.189277341 0.1117444886
## Ana 2.34790534 -0.3880845 -0.12895699 0.007628838 0.0253542859
## Carlos -0.89372557 1.5189012 -0.38893244 -0.124247926 0.0093618642
## Jose 2.64984571 -0.4254636 -0.46447580 -0.316066094 0.0162142654
## Sonia -2.62959083 -2.1833951 0.40705140 -0.658738298 0.0317927290
## Maria 0.08896518 1.5722752 0.02740580 -0.041923214 -0.1677121823
plot(res2, axes=c(1, 2), choix="ind", col.ind="red",new.plot=TRUE)fviz_pca_ind(res2, pointsize = 5, pointshape = 21, fill = "#E7B800", repel = TRUE, select.ind = list(cos2 = 0.1),ggtheme = mi.tema)# Comparación con el ACP
res2<-PCA(Datos, scale.unit=TRUE, ncp=5, graph = FALSE)
res2$ind$coord## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## Lucia 0.32306263 1.7725245 1.19880074 -0.05501532 0.003633384
## Pedro 0.66544057 -1.6387021 0.14547628 -0.02306468 -0.123377296
## Ines 1.00254705 -0.5156925 0.62888764 0.51644351 0.142875824
## Luis -3.17209481 -0.2627820 -0.38196027 0.67777691 -0.062503554
## Andres -0.48886797 1.3654021 -0.83523570 -0.15579197 0.123367255
## Ana 1.70863322 -1.0217004 -0.12707707 0.06683295 0.025291503
## Carlos 0.06758577 1.4623364 -0.50624044 -0.11792847 0.013123980
## Jose 2.01185516 -1.2758646 -0.54215002 -0.19778670 0.017434170
## Sonia -3.04203029 -1.2548807 0.44882861 -0.63999876 0.037884840
## Maria 0.92386867 1.3693593 -0.02932977 -0.07146746 -0.177730107
plot(res2, axes=c(1, 2), choix="ind", col.ind="red",new.plot=TRUE)fviz_pca_ind(res2, pointsize = 5, pointshape = 21, fill = "#E7B800", repel = TRUE, select.ind = list(cos2 = 0.1),ggtheme = mi.tema)UScitiesD Atlanta Chicago Denver Houston LosAngeles Miami NewYork
Chicago 587
Denver 1212 920
Houston 701 940 879
LosAngeles 1936 1745 831 1374
Miami 604 1188 1726 968 2339
NewYork 748 713 1631 1420 2451 1092
SanFrancisco 2139 1858 949 1645 347 2594 2571
Seattle 2182 1737 1021 1891 959 2734 2408
Washington.DC 543 597 1494 1220 2300 923 205
SanFrancisco Seattle
Chicago
Denver
Houston
LosAngeles
Miami
NewYork
SanFrancisco
Seattle 678
Washington.DC 2442 2329
UScitiesD.matriz <- as.matrix(UScitiesD)
UScitiesD.matriz Atlanta Chicago Denver Houston LosAngeles Miami NewYork
Atlanta 0 587 1212 701 1936 604 748
Chicago 587 0 920 940 1745 1188 713
Denver 1212 920 0 879 831 1726 1631
Houston 701 940 879 0 1374 968 1420
LosAngeles 1936 1745 831 1374 0 2339 2451
Miami 604 1188 1726 968 2339 0 1092
NewYork 748 713 1631 1420 2451 1092 0
SanFrancisco 2139 1858 949 1645 347 2594 2571
Seattle 2182 1737 1021 1891 959 2734 2408
Washington.DC 543 597 1494 1220 2300 923 205
SanFrancisco Seattle Washington.DC
Atlanta 2139 2182 543
Chicago 1858 1737 597
Denver 949 1021 1494
Houston 1645 1891 1220
LosAngeles 347 959 2300
Miami 2594 2734 923
NewYork 2571 2408 205
SanFrancisco 0 678 2442
Seattle 678 0 2329
Washington.DC 2442 2329 0
# cmdscale ejecuta el ACP (MDS) pero desde la matriz de distacias
res <- cmdscale(UScitiesD,eig=TRUE, k=2) # k es el número de componentes a usas
# Plotear la solución
x <- res$points[,1]
y <- res$points[,2]
plot(x, y, xlab="Componente 1", ylab="Componente 2",main="MDS",pch = 19)
text(x, y, labels = row.names(UScitiesD.matriz), cex=0.75 , pos = 1) # Rotar la solución
# Plotear la solución
x <- -x
y <- -y
plot(x, y, xlab="Componente 1", ylab="Componente 2",main="MDS",pch = 19)
text(x, y, labels = row.names(UScitiesD.matriz), cex=0.75 , pos = 1) setwd("~/Google Drive/MDCurso/Datos")
Datos <- read.table('EjemploEstudiantes.csv', header=TRUE, sep=';',dec=',',row.names=1)
modelo <- hclust(dist(Datos),method = "complete")plot(modelo)plot(modelo,hang = -1)
# la siguiente instrucción separa los clústeres usando 3
rect.hclust(modelo, k=3, border="red")fviz_dend(modelo, cex = 1.3,ggtheme = mi.tema)# la siguiente instrucción separa los clústeres usando 3
fviz_dend(modelo, k = 3, cex = 1.3, color_labels_by_k = FALSE, rect = TRUE,k_colors = c("#1B9E77", "#D95F02", "#7570B3"), ggtheme = mi.tema)setwd("~/Google Drive/MDCurso/Datos")
Datos <- read.table('EjemploEstudiantes.csv', header=TRUE, sep=';',dec=',',row.names=1)
modelo <- hclust(dist(Datos),method = "single")plot(modelo,hang=-1)
rect.hclust(modelo, k=3, border="blue")fviz_dend(modelo, k = 3, cex = 1.3, color_labels_by_k = FALSE, rect = TRUE,k_colors = c("#1B9E77", "#D95F02", "#7570B3"), ggtheme = mi.tema)setwd("~/Google Drive/MDCurso/Datos")
Datos <- read.table('EjemploEstudiantes.csv', header=TRUE, sep=';',dec=',',row.names=1)
modelo <- hclust(dist(Datos),method = "average")plot(modelo)
rect.hclust(modelo, k=3, border="green")fviz_dend(modelo, k = 3, cex = 1.3, color_labels_by_k = FALSE, rect = TRUE,k_colors = c("#1B9E77", "#D95F02", "#7570B3"), ggtheme = mi.tema)setwd("~/Google Drive/MDCurso/Datos")
Datos <- read.table('EjemploEstudiantes.csv', header=TRUE, sep=';',dec=',',row.names=1)
modelo <- hclust(dist(Datos),method= "ward.D")plot(modelo,hang=-1)
rect.hclust(modelo, k=3, border="magenta")fviz_dend(modelo, k = 3, cex = 1.3, color_labels_by_k = FALSE, rect = TRUE,k_colors = c("#1B9E77", "#D95F02", "#7570B3"), ggtheme = mi.tema)# cutree corta el el árbol con k clústeres
Grupo<-cutree(modelo,k=3)
NDatos<-cbind(Datos,Grupo)
NDatos## Matematicas Ciencias Espanol Historia EdFisica Grupo
## Lucia 7.0 6.5 9.2 8.6 8.0 1
## Pedro 7.5 9.4 7.3 7.0 7.0 2
## Ines 7.6 9.2 8.0 8.0 7.5 2
## Luis 5.0 6.5 6.5 7.0 9.0 3
## Andres 6.0 6.0 7.8 8.9 7.3 1
## Ana 7.8 9.6 7.7 8.0 6.5 2
## Carlos 6.3 6.4 8.2 9.0 7.2 1
## Jose 7.9 9.7 7.5 8.0 6.0 2
## Sonia 6.0 6.0 6.5 5.5 8.7 3
## Maria 6.8 7.2 8.7 9.0 7.0 1
# Establezco el directorio en donde se quiere grabar el archivo
setwd("~/Google Drive/MDCurso/Datos")
# Se graba el archivo en como un CSV
write.csv(NDatos,"NDatos.csv")# Ejemplo de las importaciones de México
setwd("~/Google Drive/MDCurso/Datos")
Datos <- read.table('ImportacionesMexico.csv', header=TRUE, sep=';',dec=',',row.names=1)
res <- PCA(Datos , scale.unit=TRUE, ncp=5, graph = FALSE)
res.hcpc <- HCPC(res, nb.clust = -1, consol = TRUE, min = 3, max = 3, graph = FALSE)plot.HCPC(res.hcpc, choice="bar")plot.HCPC(res.hcpc, choice="map",select="cos2 0.1")fviz_cluster(res.hcpc,repel = TRUE,show.clust.cent = TRUE,palette = "jco",main = "Factor map",geom = "text", select.ind = list(cos2 = 0.1))plot.HCPC(res.hcpc, choice="3D.map", angle=60)library(cluster) # Para menejo de clusteres# Función para encontrar el centroide de cada cluster
centroide <- function(num.cluster, datos, clusters) {
ind <- (clusters == num.cluster)
return(colMeans(datos[ind,]))
}
setwd("~/Google Drive/MDCurso/Datos")
Datos <- read.table('EjemploEstudiantes.csv', header=TRUE, sep=';',dec=',',row.names=1)
modelo <- hclust(dist(Datos),method= "ward.D")
grupos <- cutree(modelo, k=3)
grupos## Lucia Pedro Ines Luis Andres Ana Carlos Jose Sonia Maria
## 1 2 2 3 1 2 1 2 3 1
centro.cluster1<-centroide(1,Datos,grupos)
centro.cluster1## Matematicas Ciencias Espanol Historia EdFisica
## 6.525 6.525 8.475 8.875 7.375
centro.cluster2<-centroide(2,Datos,grupos)
centro.cluster2## Matematicas Ciencias Espanol Historia EdFisica
## 7.700 9.475 7.625 7.750 6.750
centro.cluster3<-centroide(3,Datos,grupos)
centro.cluster3## Matematicas Ciencias Espanol Historia EdFisica
## 5.50 6.25 6.50 6.25 8.85
centros<-rbind(centro.cluster1,centro.cluster2,centro.cluster3)
centros## Matematicas Ciencias Espanol Historia EdFisica
## centro.cluster1 6.525 6.525 8.475 8.875 7.375
## centro.cluster2 7.700 9.475 7.625 7.750 6.750
## centro.cluster3 5.500 6.250 6.500 6.250 8.850
color <- c("#ECD078","#D95B43","#C02942","#542437","#53777A")
parTemp <- par(bg = "#4D4545")
barplot(centros[1,],col=color,las=2, cex.names = 0.8, ylim = c(0,10))barplot(centros[2,],col=color,las=2, cex.names = 0.8, ylim = c(0,10))barplot(centros[3,],beside=TRUE,col=color,las=2, cex.names = 0.8, ylim = c(0,10))barplot(t(centros),beside=TRUE,col=color, cex.names = 0.8, ylim = c(0,10))par(parTemp)centros<-as.data.frame(centros)
maximos<-apply(centros,2,max)
minimos<-apply(centros,2,min)
centros<-rbind(minimos,centros)
centros<-rbind(maximos,centros)
centros## Matematicas Ciencias Espanol Historia EdFisica
## 1 7.700 9.475 8.475 8.875 8.850
## 11 5.500 6.250 6.500 6.250 6.750
## centro.cluster1 6.525 6.525 8.475 8.875 7.375
## centro.cluster2 7.700 9.475 7.625 7.750 6.750
## centro.cluster3 5.500 6.250 6.500 6.250 8.850
library(fmsb) # Paquete para usar radarchartcolor <- c("#CC333F","#EB6841","#EDC951")
radarchart(as.data.frame(centros),maxmin=TRUE,axistype=4,axislabcol="slategray4",
centerzero=FALSE,seg=8, cglcol="gray67",
pcol=color,plty=1,plwd=5,title="Comparación de clústeres")
legenda <-legend(1.5,1, legend=c("Cluster 1","Cluster 2","Cluster 3"),
seg.len=-1.4,title="Clústeres",pch=21,bty="n" ,lwd=3, y.intersp=1,
horiz=FALSE,col=color)setwd("~/Google Drive/MDCurso/Datos")
Datos <- read.table('EjemploClientesCorregidaEdad.csv', header=TRUE, sep=';',dec=',',row.names=1)
modelo <- hclust(dist(Datos),method= "ward.D")plot(modelo,las=1,hang=-1)fviz_dend(modelo, cex = 1,ggtheme = mi.tema)# Para encontrar el centroide de cada cluster
grupos <- cutree(modelo, k=3)
centro.cluster1<-centroide(1,Datos,grupos)
centro.cluster2<-centroide(2,Datos,grupos)
centro.cluster3<-centroide(3,Datos,grupos)
centros<-rbind(centro.cluster1,centro.cluster2,centro.cluster3)
color <- c("#FF6449", "#FEB035", "#FCE020", "#F7EC69", "#F1F8BE","#D5B9F6",
"#A2E3CD","#EDF380", "#D8FD9C", "#AEEC64", "#F598F8", "#9EFF37")
parTemp <- par(bg = "#4D4545")
barplot(centros[1,],col=color,las=2,cex.names = 0.65, ylim = c(0,12))barplot(centros[2,],col=color,las=2, cex.names = 0.65, ylim = c(0,12))barplot(centros[3,],beside=TRUE,col=color,las=2, cex.names = 0.65, ylim = c(0,12))barplot(t(centros),beside=TRUE,legend=colnames(Datos),main = "Gráfico de Interpretación de Clases",col=color, cex.names = 0.65, ylim = c(0,25))par(parTemp)centros<-as.data.frame(centros)
maximos<-apply(centros,2,max)
minimos<-apply(centros,2,min)
centros<-rbind(minimos,centros)
centros<-rbind(maximos,centros)
centros## Edad.10 Antiguedad Espacios.Parqueo Velocidad.Cajas
## 1 3.080000 5.833333 6.850000 8.34
## 11 2.360000 0.600000 5.466667 7.44
## centro.cluster1 2.360000 0.600000 6.220000 8.34
## centro.cluster2 3.066667 5.833333 6.850000 8.10
## centro.cluster3 3.080000 3.000000 5.466667 7.44
## Distribucion.Productos Atencion.Empleados
## 1 8.120000 9.713333
## 11 4.626667 9.508333
## centro.cluster1 8.120000 9.700000
## centro.cluster2 7.600000 9.508333
## centro.cluster3 4.626667 9.713333
## Calidad.Instalaciones Ubicacion Limpieza
## 1 4.700000 9.160000 7.450000
## 11 2.406667 8.833333 5.626667
## centro.cluster1 4.700000 9.160000 7.360000
## centro.cluster2 3.500000 8.833333 7.450000
## centro.cluster3 2.406667 9.026667 5.626667
## Variedad.Productos Prestigio.Empresa Calidad.Servicio
## 1 7.466667 8.520000 5.325
## 11 5.960000 5.426667 4.960
## centro.cluster1 7.440000 8.520000 5.070
## centro.cluster2 7.466667 7.933333 5.325
## centro.cluster3 5.960000 5.426667 4.960
color <- c("#61492D","#939C53","#F3D079")
radarchart(as.data.frame(centros),maxmin=TRUE,axistype=4,axislabcol="slategray4",
centerzero=FALSE,seg=8, cglcol="gray67",
pcol=color,plty=1,plwd=5,title="Comparación de clústeres")
legenda <-legend(1.5,1, legend=c("Cluster 1","Cluster 2","Cluster 3"),
seg.len=-1.4,title="Clústeres",pch=21,bty="n" ,lwd=3, y.intersp=1,
horiz=FALSE,col=color)setwd("~/Google Drive/MDCurso/Datos")
datos<-read.csv("Ej1kmeans.csv",sep = ";",header=F)
head(datos)## V1 V2
## 1 -0.3508569 0.3348495
## 2 0.4312463 -0.2319262
## 3 -0.2342527 -0.1372323
## 4 -0.2942210 -0.4817277
## 5 -0.1512742 0.3637852
## 6 0.3247037 -0.1647549
grupos<-kmeans(datos,centers=2,iter.max=100) ## iter.max por defecto es 10
grupos## K-means clustering with 2 clusters of sizes 51, 49
##
## Cluster means:
## V1 V2
## 1 0.02169424 0.08865999
## 2 0.99128291 1.07898833
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 7.489019 9.397754
## (between_SS / total_SS = 74.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
grupos$cluster## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
grupos$centers## V1 V2
## 1 0.02169424 0.08865999
## 2 0.99128291 1.07898833
grupos$totss # Inercia Total## [1] 64.88887
grupos$withinss # Inercia Intra-clases por grupo (una para cada grupo)## [1] 7.489019 9.397754
grupos$tot.withinss # Inercia Intra-clases## [1] 16.88677
grupos$betweenss # inercia Inter-clases## [1] 48.0021
# Verificación del Teorema de Fisher
grupos$totss==grupos$tot.withinss+grupos$betweenss ## [1] TRUE
grupos$size # Tamaño de las clases## [1] 51 49
plot(datos,pch=19,xlab=expression(x[1]),ylab=expression(x[2]))
points(grupos$centers,pch=19,col="#FF9C5B",cex=2)
points(datos,col=grupos$cluster+1,pch=19)datos.escalado <- data.frame(scale(datos), grupos$cluster)
clusplot(datos.escalado, grupos$cluster, color=TRUE, shade=TRUE,
labels=2, lines=0)parTemp <- par(bg = "#4D4545")
barplot(grupos$centers[1,],col='#F5634A')barplot(grupos$centers[2,],col='#ED303C')rownames(grupos$centers)<-c("Cluster 1","Cluster 2")
barplot(t(grupos$centers),beside=TRUE,col=c("#F5634A","#F5634A","#ED303C","#ED303C"))par(parTemp)## con iter.max=10 algunas veces encuentra soluciones suboptimales
grupos<-kmeans(datos,centers=4,iter.max=50)
grupos$cluster## [1] 3 4 3 3 3 4 3 3 4 3 3 3 4 4 3 4 3 3 3 4 4 4 4 3 3 4 3 3 4 3 4 3 3 3 3
## [36] 3 4 3 3 4 3 3 4 3 3 3 4 3 3 3 1 1 1 1 1 2 1 4 2 2 1 1 1 2 2 2 1 2 2 2
## [71] 2 2 2 1 2 1 1 1 1 2 2 1 2 1 1 1 1 1 1 1 1 1 2 2 2 1 1 1 2 1
grupos$centers## V1 V2
## 1 0.8136006 1.205556332
## 2 1.2489223 0.895464737
## 3 -0.1479986 0.132090045
## 4 0.3327978 0.009038232
plot(datos,pch=19,xlab=expression(x[1]),ylab=expression(x[2]))
points(grupos$centers,pch=19,col="black",cex=2)
points(datos,col=grupos$cluster+1,pch=19)datos.escalado<- data.frame(scale(datos), grupos$cluster)
clusplot(datos.escalado, grupos$cluster, color=TRUE, shade=TRUE,
labels=2, lines=0)setwd("~/Google Drive/MDCurso/Datos")
datos <- read.csv("EjemploClientesCorregidaEdad.csv",header=TRUE, sep=";", dec=",", row.names=1)
str(datos)## 'data.frame': 37 obs. of 12 variables:
## $ Edad.10 : num 2.5 2.4 2.8 2.3 4.9 3.2 2.6 2.3 2.5 2.9 ...
## $ Antiguedad : int 1 0 7 0 6 4 0 4 4 0 ...
## $ Espacios.Parqueo : num 7.6 4.8 6.8 3.4 7 5.6 6.2 5.6 4.6 5.4 ...
## $ Velocidad.Cajas : num 7.6 9 8.4 7.8 3.2 7.8 8 6.8 8 6.4 ...
## $ Distribucion.Productos: num 7.8 7.2 7.6 9 1.2 6.8 6.6 6.2 3.8 8.8 ...
## $ Atencion.Empleados : num 9.7 10 8.7 10 10 10 9.3 9.7 10 9.7 ...
## $ Calidad.Instalaciones : num 5 2 2.7 1 4 3 3.3 4 1.7 6.7 ...
## $ Ubicacion : num 9 9.6 9.2 10 9 10 8.6 6.8 9.8 10 ...
## $ Limpieza : num 7.6 6.8 6.2 4.4 1.4 5 7.8 6.8 5 5.6 ...
## $ Variedad.Productos : num 5.6 8.4 9 4 4.8 4.2 6.4 7.4 4.4 6.2 ...
## $ Prestigio.Empresa : num 7 9.8 9.6 2.8 2.6 4.2 9.6 5.6 6 8.4 ...
## $ Calidad.Servicio : num 6.6 5.4 8.5 5.4 3.3 7.2 6.5 4.5 7.6 6.5 ...
grupos<-kmeans(datos,centers=3,iter.max=200)
grupos$cluster # Cluster al que pertenece cada fila## Ariana Guiselle Francisco Griselda Damaris Johana
## 3 3 2 1 1 1
## Bernal Freddy Estafania Laura Arnoldo Beatriz
## 3 2 1 3 2 2
## Rebeca Sofia Ingrid Rocio Karen Luis
## 3 2 2 1 2 3
## Pedro Lorena Elena Julian Natalie Shirley
## 3 1 2 1 1 1
## Andres Alejandro Grace Nuria Flor Roberto
## 1 2 1 3 3 2
## Victor Arturo Maritza Diana Juan Guillermo
## 3 2 3 1 1 3
## Silvia
## 3
grupos$centers # Centros de los clústeres## Edad.10 Antiguedad Espacios.Parqueo Velocidad.Cajas
## 1 3.169231 2.769231 5.400000 7.461538
## 2 3.118182 6.181818 6.527273 8.018182
## 3 2.392308 1.307692 6.492308 8.230769
## Distribucion.Productos Atencion.Empleados Calidad.Instalaciones
## 1 4.476923 9.692308 2.084615
## 2 6.927273 9.554545 3.845455
## 3 8.261538 9.669231 4.284615
## Ubicacion Limpieza Variedad.Productos Prestigio.Empresa Calidad.Servicio
## 1 9.200000 5.646154 5.723077 5.261538 5.069231
## 2 8.509091 6.818182 7.200000 7.618182 4.781818
## 3 9.215385 7.615385 7.676923 8.430769 5.423077
colnames(datos) ## [1] "Edad.10" "Antiguedad"
## [3] "Espacios.Parqueo" "Velocidad.Cajas"
## [5] "Distribucion.Productos" "Atencion.Empleados"
## [7] "Calidad.Instalaciones" "Ubicacion"
## [9] "Limpieza" "Variedad.Productos"
## [11] "Prestigio.Empresa" "Calidad.Servicio"
color <- c("#FF6449", "#FEB035", "#FCE020", "#F7EC69", "#F1F8BE","#D5B9F6",
"#A2E3CD","#EDF380", "#D8FD9C", "#AEEC64", "#F598F8", "#9EFF37")
parTemp <- par(bg = "#4D4545")
barplot(grupos$centers[1,],col=color,las=2,cex.names = 0.7, ylim = c(0,12))barplot(grupos$centers[2,],col=color,las=2,cex.names = 0.7, ylim = c(0,12))barplot(grupos$centers[3,],col=color,las=2,cex.names = 0.7, ylim = c(0,12))rownames(grupos$centers)<-c("Cluster 1","Cluster 2","Cluster 3")
barplot(t(grupos$centers),beside=TRUE,legend=colnames(datos),main = "Gráfico de Interpretación de Clases",col=color,ylim=c(0,30),cex.names = 0.7)par(parTemp)# Gráfico Tipo Araña
centros<-grupos$centers
rownames(centros)<-c("Cluster 1","Cluster 2","Cluster 3")
centros<-as.data.frame(centros)
maximos<-apply(centros,2,max)
minimos<-apply(centros,2,min)
centros<-rbind(minimos,centros)
centros<-rbind(maximos,centros)
centros## Edad.10 Antiguedad Espacios.Parqueo Velocidad.Cajas
## 1 3.169231 6.181818 6.527273 8.230769
## 11 2.392308 1.307692 5.400000 7.461538
## Cluster 1 3.169231 2.769231 5.400000 7.461538
## Cluster 2 3.118182 6.181818 6.527273 8.018182
## Cluster 3 2.392308 1.307692 6.492308 8.230769
## Distribucion.Productos Atencion.Empleados Calidad.Instalaciones
## 1 8.261538 9.692308 4.284615
## 11 4.476923 9.554545 2.084615
## Cluster 1 4.476923 9.692308 2.084615
## Cluster 2 6.927273 9.554545 3.845455
## Cluster 3 8.261538 9.669231 4.284615
## Ubicacion Limpieza Variedad.Productos Prestigio.Empresa
## 1 9.215385 7.615385 7.676923 8.430769
## 11 8.509091 5.646154 5.723077 5.261538
## Cluster 1 9.200000 5.646154 5.723077 5.261538
## Cluster 2 8.509091 6.818182 7.200000 7.618182
## Cluster 3 9.215385 7.615385 7.676923 8.430769
## Calidad.Servicio
## 1 5.423077
## 11 4.781818
## Cluster 1 5.069231
## Cluster 2 4.781818
## Cluster 3 5.423077
color <- c("#FCEBB6","#78C0A8","#5E412F")
radarchart(as.data.frame(centros),maxmin=TRUE,axistype=4,axislabcol="slategray4",
centerzero=FALSE,seg=8, cglcol="gray67",
pcol=color,plty=1,plwd=5,title="Comparación de clústeres")
legenda <-legend(1.5,1, legend=c("Cluster 1","Cluster 2","Cluster 3"),
seg.len=-1.4,title="Clústeres",pch=21,bty="n" ,lwd=3, y.intersp=1,
horiz=FALSE,col=color)# En grupos$cluster está a qué cluster pertenece cada fila de la tabla de datos
NDatos<-cbind(datos,Grupo=grupos$cluster)
head(NDatos) Edad.10 Antiguedad Espacios.Parqueo Velocidad.Cajas
Ariana 2.5 1 7.6 7.6
Guiselle 2.4 0 4.8 9.0
Francisco 2.8 7 6.8 8.4
Griselda 2.3 0 3.4 7.8
Damaris 4.9 6 7.0 3.2
Johana 3.2 4 5.6 7.8
Distribucion.Productos Atencion.Empleados
Ariana 7.8 9.7
Guiselle 7.2 10.0
Francisco 7.6 8.7
Griselda 9.0 10.0
Damaris 1.2 10.0
Johana 6.8 10.0
Calidad.Instalaciones Ubicacion Limpieza Variedad.Productos
Ariana 5.0 9.0 7.6 5.6
Guiselle 2.0 9.6 6.8 8.4
Francisco 2.7 9.2 6.2 9.0
Griselda 1.0 10.0 4.4 4.0
Damaris 4.0 9.0 1.4 4.8
Johana 3.0 10.0 5.0 4.2
Prestigio.Empresa Calidad.Servicio Grupo
Ariana 7.0 6.6 3
Guiselle 9.8 5.4 3
Francisco 9.6 8.5 2
Griselda 2.8 5.4 1
Damaris 2.6 3.3 1
Johana 4.2 7.2 1
# Establezco el directorio en donde se quiere grabar el archivo
#setwd("~/Google Drive/MDCurso/Datos")
# Se graba el archivo en como un CSV
write.csv(NDatos,"NDatos.csv")setwd("~/Google Drive/MDCurso/Datos")
datos <- read.csv("EjemploClientesCorregidaEdad.csv",header=TRUE, sep=";", dec=",", row.names=1)
str(datos)## 'data.frame': 37 obs. of 12 variables:
## $ Edad.10 : num 2.5 2.4 2.8 2.3 4.9 3.2 2.6 2.3 2.5 2.9 ...
## $ Antiguedad : int 1 0 7 0 6 4 0 4 4 0 ...
## $ Espacios.Parqueo : num 7.6 4.8 6.8 3.4 7 5.6 6.2 5.6 4.6 5.4 ...
## $ Velocidad.Cajas : num 7.6 9 8.4 7.8 3.2 7.8 8 6.8 8 6.4 ...
## $ Distribucion.Productos: num 7.8 7.2 7.6 9 1.2 6.8 6.6 6.2 3.8 8.8 ...
## $ Atencion.Empleados : num 9.7 10 8.7 10 10 10 9.3 9.7 10 9.7 ...
## $ Calidad.Instalaciones : num 5 2 2.7 1 4 3 3.3 4 1.7 6.7 ...
## $ Ubicacion : num 9 9.6 9.2 10 9 10 8.6 6.8 9.8 10 ...
## $ Limpieza : num 7.6 6.8 6.2 4.4 1.4 5 7.8 6.8 5 5.6 ...
## $ Variedad.Productos : num 5.6 8.4 9 4 4.8 4.2 6.4 7.4 4.4 6.2 ...
## $ Prestigio.Empresa : num 7 9.8 9.6 2.8 2.6 4.2 9.6 5.6 6 8.4 ...
## $ Calidad.Servicio : num 6.6 5.4 8.5 5.4 3.3 7.2 6.5 4.5 7.6 6.5 ...
grupos<-kmeans(datos,centers=3,iter.max=200,nstart=200)
grupos$cluster # Cluster al que pertenece cada fila## Ariana Guiselle Francisco Griselda Damaris Johana
## 1 1 2 3 3 3
## Bernal Freddy Estafania Laura Arnoldo Beatriz
## 1 2 3 1 2 2
## Rebeca Sofia Ingrid Rocio Karen Luis
## 1 2 2 3 2 1
## Pedro Lorena Elena Julian Natalie Shirley
## 1 3 2 3 3 3
## Andres Alejandro Grace Nuria Flor Roberto
## 3 2 3 1 1 2
## Victor Arturo Maritza Diana Juan Guillermo
## 1 2 1 3 3 1
## Silvia
## 1
grupos$centers # Centros de los clústeres## Edad.10 Antiguedad Espacios.Parqueo Velocidad.Cajas
## 1 2.392308 1.307692 6.492308 8.230769
## 2 3.118182 6.181818 6.527273 8.018182
## 3 3.169231 2.769231 5.400000 7.461538
## Distribucion.Productos Atencion.Empleados Calidad.Instalaciones
## 1 8.261538 9.669231 4.284615
## 2 6.927273 9.554545 3.845455
## 3 4.476923 9.692308 2.084615
## Ubicacion Limpieza Variedad.Productos Prestigio.Empresa Calidad.Servicio
## 1 9.215385 7.615385 7.676923 8.430769 5.423077
## 2 8.509091 6.818182 7.200000 7.618182 4.781818
## 3 9.200000 5.646154 5.723077 5.261538 5.069231
# Gráfico Tipo Araña
centros<-grupos$centers
rownames(centros)<-c("Cluster 1","Cluster 2","Cluster 3")
centros<-as.data.frame(centros)
maximos<-apply(centros,2,max)
minimos<-apply(centros,2,min)
centros<-rbind(minimos,centros)
centros<-rbind(maximos,centros)
centros## Edad.10 Antiguedad Espacios.Parqueo Velocidad.Cajas
## 1 3.169231 6.181818 6.527273 8.230769
## 11 2.392308 1.307692 5.400000 7.461538
## Cluster 1 2.392308 1.307692 6.492308 8.230769
## Cluster 2 3.118182 6.181818 6.527273 8.018182
## Cluster 3 3.169231 2.769231 5.400000 7.461538
## Distribucion.Productos Atencion.Empleados Calidad.Instalaciones
## 1 8.261538 9.692308 4.284615
## 11 4.476923 9.554545 2.084615
## Cluster 1 8.261538 9.669231 4.284615
## Cluster 2 6.927273 9.554545 3.845455
## Cluster 3 4.476923 9.692308 2.084615
## Ubicacion Limpieza Variedad.Productos Prestigio.Empresa
## 1 9.215385 7.615385 7.676923 8.430769
## 11 8.509091 5.646154 5.723077 5.261538
## Cluster 1 9.215385 7.615385 7.676923 8.430769
## Cluster 2 8.509091 6.818182 7.200000 7.618182
## Cluster 3 9.200000 5.646154 5.723077 5.261538
## Calidad.Servicio
## 1 5.423077
## 11 4.781818
## Cluster 1 5.423077
## Cluster 2 4.781818
## Cluster 3 5.069231
color <- c("#EEE6AB","#C5BC8E","#5E412F")
radarchart(as.data.frame(centros),maxmin=TRUE,axistype=4,axislabcol="slategray4",
centerzero=FALSE,seg=8, cglcol="gray67",
pcol=color,plty=1,plwd=5,title="Comparación de clústeres")
legenda <-legend(1.5,1, legend=c("Cluster 1","Cluster 2","Cluster 3"),
seg.len=-1.4,title="Clústeres",pch=21,bty="n" ,lwd=3, y.intersp=1,
horiz=FALSE,col=color)setwd("~/Google Drive/MDCurso/Datos")
datos <- read.table('SpamData.csv', header=TRUE, sep=';',dec='.')
datos<-datos[,c(2,4,5,6,7,9,10,11)]
t1<-system.time(grupos<-kmeans(datos,centers=3,iter.max=3,nstart=1))
t1 user system elapsed
0.040 0.000 0.041
par(mfrow=c(1,2))
centros<-grupos$centers
rownames(centros)<-c("Cluster 1","Cluster 2","Cluster 3")
centros<-as.data.frame(centros)
maximos<-apply(centros,2,max)
minimos<-apply(centros,2,min)
centros<-rbind(minimos,centros)
centros<-rbind(maximos,centros)
color <- c("#FCEBB6","#78C0A8","#5E412F")
radarchart(as.data.frame(centros),maxmin=TRUE,axistype=4,axislabcol="slategray4",
centerzero=FALSE,seg=8, cglcol="gray67",
pcol=color,plty=1,plwd=5,title="Comparación de clústeres")
legenda <-legend(1.5,1, legend=c("Cluster 1","Cluster 2","Cluster 3"),
seg.len=-1.4,title="Clústeres",pch=21,bty="n" ,lwd=3, y.intersp=1,
horiz=FALSE,col=color)
t2<-system.time(grupos<-kmeans(datos,centers=3,iter.max=200,nstart = 200))
t2 user system elapsed
0.517 0.009 0.528
centros<-grupos$centers
rownames(centros)<-c("Cluster 1","Cluster 2","Cluster 3")
centros<-as.data.frame(centros)
maximos<-apply(centros,2,max)
minimos<-apply(centros,2,min)
centros<-rbind(minimos,centros)
centros<-rbind(maximos,centros)
radarchart(as.data.frame(centros),maxmin=TRUE,axistype=4,axislabcol="slategray4",
centerzero=FALSE,seg=8, cglcol="gray67",
pcol=color,plty=1,plwd=5,title="Comparación de clústeres")
legenda <-legend(1.5,1, legend=c("Cluster 1","Cluster 2","Cluster 3"),
seg.len=-1.4,title="Clústeres",pch=21,bty="n" ,lwd=3, y.intersp=1,
horiz=FALSE,col=color)generate <- function(n=50, extradim=0, sigma=1, mu=7) {
data1 <- matrix(rnorm(n*2), n, 2) * sigma
data1[,1] <- data1[,1] + centers[1,1] * mu
data1[,2] <- data1[,2] + centers[1,2] * mu
data2 <- matrix(rnorm(n*2), n, 2) * sigma
data2[,1] <- data2[,1] + centers[2,1] * mu
data2[,2] <- data2[,2] + centers[2,2] * mu
data3 <- matrix(rnorm(n*2), n, 2) * sigma
data3[,1] <- data3[,1] + centers[3,1] * mu
data3[,2] <- data3[,2] + centers[3,2] * mu
data <- rbind(data1,data2,data3)
if (extradim > 0) {
noise <- matrix(rnorm(3*n*extradim)*sigma, 3*n, extradim)
data <- cbind(data, noise)
}
return(data)
}centers <- matrix(c(0,3,1,3,0,4), 3, 2, byrow=T)
centers [,1] [,2]
[1,] 0 3
[2,] 1 3
[3,] 0 4
Data1 <- generate(extradim=0)
head(Data1) [,1] [,2]
[1,] 1.1075277 20.93870
[2,] -0.6916751 21.89525
[3,] 0.1957096 20.97194
[4,] 0.2922544 20.00025
[5,] -1.0518964 20.84040
[6,] -1.5096944 20.47231
dim(Data1)[1] 150 2
groups<-kmeans(Data1,centers=3,iter.max=100,nstart=50)plot(Data1)
points(groups$centers,pch=19,col="blue",cex=2)
points(Data1,col=groups$cluster,pch=19)mydata <- data.frame( (Data1), groups$cluster)
clusplot(mydata, groups$cluster, color=TRUE, shade=TRUE,
labels=2, lines=0)library(scatterplot3d)
Data2 <- generate(extradim=1)
g3d<-scatterplot3d(Data2)
groups<-kmeans(Data2, 3)
g3d$points3d(groups$centers,pch=19,col="blue",cex=2)
g3d$points3d(Data2,col=groups$cluster,pch=19)Datos <- generate(extradim=0)
InerciaIC<-rep(0,30)
for(k in 1:30) {
grupos<-kmeans(Datos,centers=k,nstart=25)
InerciaIC[k]<-grupos$tot.withinss
}
plot(InerciaIC,col="blue",type="b")mis.datos <- scale(Datos)
fviz_nbclust(mis.datos, kmeans, method = "gap_stat")setwd("~/Google Drive/MDCurso/Datos")
Datos <- read.csv("EjemploClientesCorregidaEdad.csv",header=TRUE, sep=";", dec=",", row.names=1)InerciaIC<-rep(0,30)
for(k in 1:30) {
grupos<-kmeans(Datos,centers=k,nstart=25)
InerciaIC[k]<-grupos$tot.withinss
}
plot(InerciaIC,col="blue",type="b")mis.datos <- scale(Datos)
fviz_nbclust(mis.datos, kmeans, method = "gap_stat")La función “daisy” de la biblioteca “cluster” permite calcular la matriz de distancias en tablas de datos cuyas variables están mezcladas entre variables cualtitativas y cuantitativas.
library(cluster)
setwd("~/Google Drive/MDCurso/Datos")
Datos <- read.csv("SAheart.csv",header=TRUE, sep=";", dec=".")
str(Datos)## 'data.frame': 462 obs. of 10 variables:
## $ sbp : int 160 144 118 170 134 132 142 114 114 132 ...
## $ tobacco : num 12 0.01 0.08 7.5 13.6 6.2 4.05 4.08 0 0 ...
## $ ldl : num 5.73 4.41 3.48 6.41 3.5 6.47 3.38 4.59 3.83 5.8 ...
## $ adiposity: num 23.1 28.6 32.3 38 27.8 ...
## $ famhist : Factor w/ 2 levels "Absent","Present": 2 1 2 2 2 2 1 2 2 2 ...
## $ typea : int 49 55 52 51 60 62 59 62 49 69 ...
## $ obesity : num 25.3 28.9 29.1 32 26 ...
## $ alcohol : num 97.2 2.06 3.81 24.26 57.34 ...
## $ age : int 52 63 46 58 49 45 38 58 29 53 ...
## $ chd : Factor w/ 2 levels "No","Si": 2 2 1 2 2 1 1 2 1 2 ...
dim(Datos)## [1] 462 10
head(Datos)## sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
## 1 160 12.00 5.73 23.11 Present 49 25.30 97.20 52 Si
## 2 144 0.01 4.41 28.61 Absent 55 28.87 2.06 63 Si
## 3 118 0.08 3.48 32.28 Present 52 29.14 3.81 46 No
## 4 170 7.50 6.41 38.03 Present 51 31.99 24.26 58 Si
## 5 134 13.60 3.50 27.78 Present 60 25.99 57.34 49 Si
## 6 132 6.20 6.47 36.21 Present 62 30.77 14.14 45 No
D<-daisy(Datos, metric = "euclidean")## Warning in daisy(Datos, metric = "euclidean"): with mixed variables, metric
## "gower" is used automatically
jer<-hclust(D, method = "complete")plot(jer)
rect.hclust(jer, k = 4, border = "red")fviz_dend(jer, k = 4, cex = 1.3, color_labels_by_k = FALSE, rect = TRUE,k_colors = c("#515151", "#F38630", "#00B4FF", "#ECD078"), ggtheme = mi.tema, show_labels = F)grupo<-cutree(jer, k = 4)
NDatos<-cbind(Datos,grupo)
head(NDatos)## sbp tobacco ldl adiposity famhist typea obesity alcohol age chd grupo
## 1 160 12.00 5.73 23.11 Present 49 25.30 97.20 52 Si 1
## 2 144 0.01 4.41 28.61 Absent 55 28.87 2.06 63 Si 2
## 3 118 0.08 3.48 32.28 Present 52 29.14 3.81 46 No 3
## 4 170 7.50 6.41 38.03 Present 51 31.99 24.26 58 Si 1
## 5 134 13.60 3.50 27.78 Present 60 25.99 57.34 49 Si 1
## 6 132 6.20 6.47 36.21 Present 62 30.77 14.14 45 No 3
# Se deben quitar las variables cualitativas para hacer un gráfico tipo araña
setwd("~/Google Drive/MDCurso/Datos")
Datos <- read.csv("SAheart.csv",header=TRUE, sep=";", dec=".")
str(Datos)## 'data.frame': 462 obs. of 10 variables:
## $ sbp : int 160 144 118 170 134 132 142 114 114 132 ...
## $ tobacco : num 12 0.01 0.08 7.5 13.6 6.2 4.05 4.08 0 0 ...
## $ ldl : num 5.73 4.41 3.48 6.41 3.5 6.47 3.38 4.59 3.83 5.8 ...
## $ adiposity: num 23.1 28.6 32.3 38 27.8 ...
## $ famhist : Factor w/ 2 levels "Absent","Present": 2 1 2 2 2 2 1 2 2 2 ...
## $ typea : int 49 55 52 51 60 62 59 62 49 69 ...
## $ obesity : num 25.3 28.9 29.1 32 26 ...
## $ alcohol : num 97.2 2.06 3.81 24.26 57.34 ...
## $ age : int 52 63 46 58 49 45 38 58 29 53 ...
## $ chd : Factor w/ 2 levels "No","Si": 2 2 1 2 2 1 1 2 1 2 ...
D<-daisy(Datos, metric = "euclidean")## Warning in daisy(Datos, metric = "euclidean"): with mixed variables, metric
## "gower" is used automatically
jer<-hclust(D, method = "complete")
grupos <- cutree(jer, k=3)
centro.cluster1<-centroide(1,Datos[,-c(5,10)],grupos)
centro.cluster2<-centroide(2,Datos[,-c(5,10)],grupos)
centro.cluster3<-centroide(3,Datos[,-c(5,10)],grupos)
centros<-rbind(centro.cluster1,centro.cluster2,centro.cluster3)
centros<-as.data.frame(centros)
maximos<-apply(centros,2,max)
minimos<-apply(centros,2,min)
centros<-rbind(minimos,centros)
centros<-rbind(maximos,centros)
centros## sbp tobacco ldl adiposity typea obesity
## 1 144.3229 5.913281 5.866771 28.69698 54.56250 26.83281
## 11 135.4603 2.634735 4.344238 23.96911 52.36755 25.73745
## centro.cluster1 144.3229 5.265937 5.866771 28.69698 54.44792 26.83281
## centro.cluster2 142.8594 5.913281 4.919688 27.25516 54.56250 26.30813
## centro.cluster3 135.4603 2.634735 4.344238 23.96911 52.36755 25.73745
## alcohol age
## 1 21.06146 51.55208
## 11 15.93136 38.85430
## centro.cluster1 21.06146 51.55208
## centro.cluster2 16.27094 48.40625
## centro.cluster3 15.93136 38.85430
color <- c("#FCEBB6","#78C0A8","#5E412F")
radarchart(as.data.frame(centros),maxmin=TRUE,axistype=4,axislabcol="slategray4",
centerzero=FALSE,seg=8, cglcol="gray67",
pcol=color,plty=1,plwd=5,title="Comparación de clústeres")
legenda <-legend(1.5,1, legend=c("Cluster 1","Cluster 2","Cluster 3"),
seg.len=-1.4,title="Clústeres",pch=21,bty="n" ,lwd=3, y.intersp=1,
horiz=FALSE,col=color)grupo <- cutree(jer, k = 3)
NDatos <- cbind(Datos, grupo)
cluster <- NDatos$grupo
sel.cluster1 <- match(cluster, 1, 0)
sel.cluster1[1:10]## [1] 1 0 0 1 1 0 0 1 0 1
Datos.Cluster1 <- NDatos[sel.cluster1 > 0,]
dim(Datos.Cluster1)## [1] 96 11
sel.cluster2 <- match(cluster, 2, 0)
Datos.Cluster2 <- NDatos[sel.cluster2 > 0,]
dim(Datos.Cluster2)## [1] 64 11
sel.cluster3 <- match(cluster, 3, 0)
Datos.Cluster3 <- NDatos[sel.cluster3 > 0,]
dim(Datos.Cluster3)## [1] 302 11
color1 <- c("#ECD078","#D95B43")
plot(Datos$famhist, col = color1, las = 2, main = "Variable famhist", xlab = "Todos los Datos")plot(Datos.Cluster1$famhist, col = color1, las = 2, main = "Variable famhist", xlab = "Cluster 1")plot(Datos.Cluster2$famhist, col = color1, las = 2, main = "Variable famhist", xlab = "Cluster 2")plot(Datos.Cluster3$famhist, col = color1, las = 2, main = "Variable famhist", xlab = "Cluster 3")color2 <- c("#45ADA8","#9DE0AD")
plot(Datos$chd, col = color2, las = 2, main = "Variable chd", xlab = "Todos los Datos")plot(Datos.Cluster1$chd, col = color2, las = 2, main = "Variable chd", xlab = "Cluster 1")plot(Datos.Cluster2$chd, col = color2, las = 2, main = "Variable chd", xlab = "Cluster 2")plot(Datos.Cluster3$chd, col = color2, las = 2, main = "Variable chd", xlab = "Cluster 3")library(cluster)
setwd("~/Google Drive/MDCurso/Datos")
Datos <- read.csv("SAheart.csv",header=TRUE, sep=";", dec=".")
str(Datos)'data.frame': 462 obs. of 10 variables:
$ sbp : int 160 144 118 170 134 132 142 114 114 132 ...
$ tobacco : num 12 0.01 0.08 7.5 13.6 6.2 4.05 4.08 0 0 ...
$ ldl : num 5.73 4.41 3.48 6.41 3.5 6.47 3.38 4.59 3.83 5.8 ...
$ adiposity: num 23.1 28.6 32.3 38 27.8 ...
$ famhist : Factor w/ 2 levels "Absent","Present": 2 1 2 2 2 2 1 2 2 2 ...
$ typea : int 49 55 52 51 60 62 59 62 49 69 ...
$ obesity : num 25.3 28.9 29.1 32 26 ...
$ alcohol : num 97.2 2.06 3.81 24.26 57.34 ...
$ age : int 52 63 46 58 49 45 38 58 29 53 ...
$ chd : Factor w/ 2 levels "No","Si": 2 2 1 2 2 1 1 2 1 2 ...
dim(Datos)[1] 462 10
head(Datos) sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
1 160 12.00 5.73 23.11 Present 49 25.30 97.20 52 Si
2 144 0.01 4.41 28.61 Absent 55 28.87 2.06 63 Si
3 118 0.08 3.48 32.28 Present 52 29.14 3.81 46 No
4 170 7.50 6.41 38.03 Present 51 31.99 24.26 58 Si
5 134 13.60 3.50 27.78 Present 60 25.99 57.34 49 Si
6 132 6.20 6.47 36.21 Present 62 30.77 14.14 45 No
D<-daisy(Datos, metric = "euclidean")Warning in daisy(Datos, metric = "euclidean"): with mixed variables, metric
"gower" is used automatically
res <- cmdscale(D,eig=TRUE, k=2) # k es el número de componentes a usas
# Plotear la solución
x <- res$points[,1]
y <- res$points[,2]
plot(x, y, xlab="Componente 1", ylab="Componente 2",main="MDS", type="p")#text(x, y, labels = row.names(Datos), cex=.7) setwd("~/Google Drive/MDCurso/Datos")
Datos <- read.csv("SAheart.csv",header=TRUE, sep=";", dec=".")
str(Datos)'data.frame': 462 obs. of 10 variables:
$ sbp : int 160 144 118 170 134 132 142 114 114 132 ...
$ tobacco : num 12 0.01 0.08 7.5 13.6 6.2 4.05 4.08 0 0 ...
$ ldl : num 5.73 4.41 3.48 6.41 3.5 6.47 3.38 4.59 3.83 5.8 ...
$ adiposity: num 23.1 28.6 32.3 38 27.8 ...
$ famhist : Factor w/ 2 levels "Absent","Present": 2 1 2 2 2 2 1 2 2 2 ...
$ typea : int 49 55 52 51 60 62 59 62 49 69 ...
$ obesity : num 25.3 28.9 29.1 32 26 ...
$ alcohol : num 97.2 2.06 3.81 24.26 57.34 ...
$ age : int 52 63 46 58 49 45 38 58 29 53 ...
$ chd : Factor w/ 2 levels "No","Si": 2 2 1 2 2 1 1 2 1 2 ...
dim(Datos)[1] 462 10
head(Datos) sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
1 160 12.00 5.73 23.11 Present 49 25.30 97.20 52 Si
2 144 0.01 4.41 28.61 Absent 55 28.87 2.06 63 Si
3 118 0.08 3.48 32.28 Present 52 29.14 3.81 46 No
4 170 7.50 6.41 38.03 Present 51 31.99 24.26 58 Si
5 134 13.60 3.50 27.78 Present 60 25.99 57.34 49 Si
6 132 6.20 6.47 36.21 Present 62 30.77 14.14 45 No
famhist.Present <- as.numeric(Datos$famhist == "Present")
print(famhist.Present) [1] 1 0 1 1 1 1 0 1 1 1 0 1 0 0 1 1 0 1 1 1 0 1 1 0 0 1 0 0 1 0 0 0 1 0 0
[36] 0 0 0 1 1 1 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 1 0 1 1 0 0 1 1 0 1 0
[71] 0 0 0 1 0 0 0 1 0 1 1 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 1 0 1 0 0
[106] 0 0 1 0 0 0 1 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 1 0 1 0 1
[141] 0 1 0 0 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 1
[176] 1 1 0 0 0 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 1 1 0 1
[211] 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 1 0 0 1
[246] 0 1 1 0 1 0 0 0 1 1 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1 1 0 1 0 0 1 1 0 0 0
[281] 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 1 0 0 1 1 0 1 0 0
[316] 0 1 0 1 0 1 1 0 1 0 0 1 0 0 0 0 1 1 0 0 1 1 0 1 0 0 1 1 0 0 1 1 0 1 0
[351] 1 1 1 1 1 1 0 0 1 0 1 0 1 1 1 0 0 1 1 0 1 1 0 1 1 0 0 1 0 0 0 1 1 0 0
[386] 1 1 0 0 1 1 1 0 0 0 0 1 0 1 0 0 0 1 1 0 1 1 1 0 0 0 0 0 1 1 1 0 1 1 0
[421] 0 0 1 1 0 0 1 0 1 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 1 0 1 1 1 0 0 1 1 0
[456] 0 1 0 0 0 0 1
famhist.Absent <- as.numeric(Datos$famhist == "Absent")
print(famhist.Absent) [1] 0 1 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 1 1 0 1 1 0 1 1 1 0 1 1
[36] 1 1 1 0 0 0 1 1 0 1 1 0 0 1 1 0 1 1 0 1 1 1 1 1 0 1 0 0 1 1 0 0 1 0 1
[71] 1 1 1 0 1 1 1 0 1 0 0 1 0 1 0 1 1 0 0 1 1 1 1 1 1 1 0 0 0 1 0 1 0 1 1
[106] 1 1 0 1 1 1 0 1 0 1 0 1 1 0 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 0 1 0 1 0
[141] 1 0 1 1 1 1 0 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 0
[176] 0 0 1 1 1 1 0 0 0 0 0 1 1 0 0 1 0 0 1 1 0 1 1 0 1 1 0 1 1 0 1 0 0 1 0
[211] 1 0 1 1 1 1 1 1 1 1 1 0 1 0 1 1 0 1 0 0 0 1 1 1 1 1 1 0 1 1 1 0 1 1 0
[246] 1 0 0 1 0 1 1 1 0 0 0 0 0 1 0 1 0 0 0 0 1 1 1 0 0 0 1 0 1 1 0 0 1 1 1
[281] 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 1 0 1 0 0 1 1 0 0 1 0 1 1
[316] 1 0 1 0 1 0 0 1 0 1 1 0 1 1 1 1 0 0 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 0 1
[351] 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 1 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 0 0 1 1
[386] 0 0 1 1 0 0 0 1 1 1 1 0 1 0 1 1 1 0 0 1 0 0 0 1 1 1 1 1 0 0 0 1 0 0 1
[421] 1 1 0 0 1 1 0 1 0 1 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 0 1 0 0 0 1 1 0 0 1
[456] 1 0 1 1 1 1 0
chd.Si <- as.numeric(Datos$chd == "Si")
print(chd.Si) [1] 1 1 0 1 1 0 0 1 0 1 1 1 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 0 1 1 1 1 1 0
[36] 1 0 0 0 1 1 0 0 1 0 0 1 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1
[71] 0 0 0 0 0 0 0 1 1 0 1 1 1 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0
[106] 0 1 1 0 0 0 1 0 1 1 0 1 0 1 0 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 1 0 0 0 0
[141] 1 1 0 0 0 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0
[176] 1 0 0 0 0 0 0 1 1 1 1 0 0 0 1 1 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0
[211] 0 1 0 0 0 1 1 0 0 0 0 1 0 1 0 0 1 1 1 1 1 0 1 0 0 1 0 0 0 0 1 0 0 1 1
[246] 0 1 0 0 1 0 0 1 0 0 1 1 0 1 0 1 0 0 0 1 0 0 0 0 1 1 1 0 0 1 1 1 0 0 0
[281] 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 1 0 0 0 1 0 0 1 0 1 0 0
[316] 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 1 1 0 0 1 1 0 1 0
[351] 0 0 1 1 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0
[386] 1 0 1 0 1 1 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 1 0 1 1 0 1 0 0 0 0
[421] 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1
[456] 1 0 0 1 0 0 1
chd.No <- as.numeric(Datos$chd == "No")
print(chd.No) [1] 0 0 1 0 0 1 1 0 1 0 0 0 1 1 1 1 1 0 0 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0 1
[36] 0 1 1 1 0 0 1 1 0 1 1 0 0 1 1 1 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0
[71] 1 1 1 1 1 1 1 0 0 1 0 0 0 0 1 1 0 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 1
[106] 1 0 0 1 1 1 0 1 0 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 1 0 0 1 1 0 1 1 1 1
[141] 0 0 1 1 1 1 1 0 0 0 1 1 1 1 0 0 1 1 1 0 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1
[176] 0 1 1 1 1 1 1 0 0 0 0 1 1 1 0 0 0 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1
[211] 1 0 1 1 1 0 0 1 1 1 1 0 1 0 1 1 0 0 0 0 0 1 0 1 1 0 1 1 1 1 0 1 1 0 0
[246] 1 0 1 1 0 1 1 0 1 1 0 0 1 0 1 0 1 1 1 0 1 1 1 1 0 0 0 1 1 0 0 0 1 1 1
[281] 0 1 0 1 0 1 1 1 1 0 1 1 1 0 1 0 1 1 0 1 1 0 1 0 1 1 1 0 1 1 0 1 0 1 1
[316] 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 0 0 0 1 0 1 1 1 1 0 0 1 1 0 0 1 0 1
[351] 1 1 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 0 1 1
[386] 0 1 0 1 0 0 0 1 1 1 1 1 0 0 1 1 1 0 0 1 1 0 0 1 1 0 1 0 0 1 0 1 1 1 1
[421] 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 0 0
[456] 0 1 1 0 1 1 0
Datos2<-Datos[,-c(5,10)]
Datos2<-cbind(Datos2,famhist.Present)
Datos2<-cbind(Datos2,famhist.Absent)
Datos2<-cbind(Datos2,chd.Si)
Datos2<-cbind(Datos2,chd.No)
str(Datos2)'data.frame': 462 obs. of 12 variables:
$ sbp : int 160 144 118 170 134 132 142 114 114 132 ...
$ tobacco : num 12 0.01 0.08 7.5 13.6 6.2 4.05 4.08 0 0 ...
$ ldl : num 5.73 4.41 3.48 6.41 3.5 6.47 3.38 4.59 3.83 5.8 ...
$ adiposity : num 23.1 28.6 32.3 38 27.8 ...
$ typea : int 49 55 52 51 60 62 59 62 49 69 ...
$ obesity : num 25.3 28.9 29.1 32 26 ...
$ alcohol : num 97.2 2.06 3.81 24.26 57.34 ...
$ age : int 52 63 46 58 49 45 38 58 29 53 ...
$ famhist.Present: num 1 0 1 1 1 1 0 1 1 1 ...
$ famhist.Absent : num 0 1 0 0 0 0 1 0 0 0 ...
$ chd.Si : num 1 1 0 1 1 0 0 1 0 1 ...
$ chd.No : num 0 0 1 0 0 1 1 0 1 0 ...
dim(Datos2)[1] 462 12
head(Datos2) sbp tobacco ldl adiposity typea obesity alcohol age famhist.Present
1 160 12.00 5.73 23.11 49 25.30 97.20 52 1
2 144 0.01 4.41 28.61 55 28.87 2.06 63 0
3 118 0.08 3.48 32.28 52 29.14 3.81 46 1
4 170 7.50 6.41 38.03 51 31.99 24.26 58 1
5 134 13.60 3.50 27.78 60 25.99 57.34 49 1
6 132 6.20 6.47 36.21 62 30.77 14.14 45 1
famhist.Absent chd.Si chd.No
1 0 1 0
2 1 1 0
3 0 0 1
4 0 1 0
5 0 1 0
6 0 0 1
library(FactoMineR)
res<-PCA(Datos2, scale.unit=TRUE, ncp=5, graph = FALSE)
plot(res, axes=c(1, 2), choix="ind", col.ind="red",new.plot=TRUE)plot(res, axes=c(1, 2), choix="var", col.var="blue",new.plot=TRUE)fviz_pca_ind(res, pointsize = 3, pointshape = 21,label="none", fill = "#E7B800", repel = TRUE,ggtheme = mi.tema)fviz_pca_var(res,col.var="steelblue",ggtheme = mi.tema)